There is a wealth of data on the internet. How can we scrape it and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow the instructions in a blog post by Saurav Kaushik to find the most popular feature films of 2018.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2018 can be accessed at page https://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature.
#Loading the rvest and tidyverse package
library("rvest")
## Loading required package: xml2
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
# URL of the page to scrape: the 100 most popular feature films released in
# 2018. HTTPS is used (instead of plain HTTP) so the request is not sent in
# clear text and matches the address quoted in the text above.
url <- "https://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature"
# Download and parse the HTML; the outer parentheses also print the result
(webpage <- read_html(url))
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="styleguide-v2" class="fixed">\n\n <img height=" ...
Use the CSS selector to get the rankings
# Select the ranking element of every listed film by its CSS class
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Extract the text content of each ranking node
rank_data <- rank_data_html %>% html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Parse the rank strings (e.g. "1.") as integers
rank_data <- rank_data %>% as.integer()
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector .lister-item-header a.
# Select the title link of every listed film
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt1727824/?ref_=adv_li_tt">Bohemian Rhapsody</a>
## [2] <a href="/title/tt4530422/?ref_=adv_li_tt">Overlord</a>
## [3] <a href="/title/tt6966692/?ref_=adv_li_tt">Green Book</a>
## [4] <a href="/title/tt1477834/?ref_=adv_li_tt">Aquaman</a>
## [5] <a href="/title/tt5083738/?ref_=adv_li_tt">The Favourite</a>
## [6] <a href="/title/tt1517451/?ref_=adv_li_tt">A Star Is Born</a>
## [7] <a href="/title/tt3606756/?ref_=adv_li_tt">Incredibles 2</a>
## [8] <a href="/title/tt1571234/?ref_=adv_li_tt">Mortal Engines</a>
## [9] <a href="/title/tt4218572/?ref_=adv_li_tt">Widows</a>
## [10] <a href="/title/tt4532826/?ref_=adv_li_tt">Robin Hood</a>
## [11] <a href="/title/tt6155172/?ref_=adv_li_tt">Roma</a>
## [12] <a href="/title/tt6266538/?ref_=adv_li_tt">Vice</a>
## [13] <a href="/title/tt4633694/?ref_=adv_li_tt">Spider-Man: Into the Spi ...
## [14] <a href="/title/tt4154756/?ref_=adv_li_tt">Avengers: Infinity War</a>
## [15] <a href="/title/tt7349662/?ref_=adv_li_tt">BlacKkKlansman</a>
## [16] <a href="/title/tt2737304/?ref_=adv_li_tt">Bird Box</a>
## [17] <a href="/title/tt4595882/?ref_=adv_li_tt">Can You Ever Forgive Me? ...
## [18] <a href="/title/tt1034415/?ref_=adv_li_tt">Suspiria</a>
## [19] <a href="/title/tt5095030/?ref_=adv_li_tt">Ant-Man and the Wasp</a>
## [20] <a href="/title/tt8359848/?ref_=adv_li_tt">Climax</a>
## ...
# Extract the title text from each link node
title_data <- title_data_html %>% html_text()
title_data
## [1] "Bohemian Rhapsody"
## [2] "Overlord"
## [3] "Green Book"
## [4] "Aquaman"
## [5] "The Favourite"
## [6] "A Star Is Born"
## [7] "Incredibles 2"
## [8] "Mortal Engines"
## [9] "Widows"
## [10] "Robin Hood"
## [11] "Roma"
## [12] "Vice"
## [13] "Spider-Man: Into the Spider-Verse"
## [14] "Avengers: Infinity War"
## [15] "BlacKkKlansman"
## [16] "Bird Box"
## [17] "Can You Ever Forgive Me?"
## [18] "Suspiria"
## [19] "Ant-Man and the Wasp"
## [20] "Climax"
## [21] "The Mule"
## [22] "The Man Who Killed Hitler and Then The Bigfoot"
## [23] "First Man"
## [24] "Black Panther"
## [25] "Hunter Killer"
## [26] "The Girl in the Spider's Web"
## [27] "Venom"
## [28] "Bumblebee"
## [29] "Bad Times at the El Royale"
## [30] "The Ballad of Buster Scruggs"
## [31] "Mary Queen of Scots"
## [32] "Uncle Drew"
## [33] "Solo: A Star Wars Story"
## [34] "Dragon Ball Super: Broly"
## [35] "A Quiet Place"
## [36] "Fantastic Beasts: The Crimes of Grindelwald"
## [37] "Ready Player One"
## [38] "Arctic"
## [39] "A Simple Favor"
## [40] "A Private War"
## [41] "Deadpool 2"
## [42] "The Front Runner"
## [43] "The Grinch"
## [44] "Boy Erased"
## [45] "At Eternity's Gate"
## [46] "Todos lo saben"
## [47] "Tag"
## [48] "Prospect"
## [49] "Mary Poppins Returns"
## [50] "Beautiful Boy"
## [51] "Annihilation"
## [52] "Crazy Rich Asians"
## [53] "Cold War"
## [54] "Mission: Impossible - Fallout"
## [55] "If Beale Street Could Talk"
## [56] "Hereditary"
## [57] "The Nutcracker and the Four Realms"
## [58] "Instant Family"
## [59] "Halloween"
## [60] "Burning"
## [61] "Ralph Breaks the Internet"
## [62] "Ocean's 8"
## [63] "The Sisters Brothers"
## [64] "Creed II"
## [65] "The Predator"
## [66] "Hotel Transylvania 3: Summer Vacation"
## [67] "Replicas"
## [68] "Stan & Ollie"
## [69] "The Hate U Give"
## [70] "On the Basis of Sex"
## [71] "Destroyer"
## [72] "Upgrade"
## [73] "Red Sparrow"
## [74] "The Guilty"
## [75] "The House That Jack Built"
## [76] "Isle of Dogs"
## [77] "Searching"
## [78] "The Old Man & the Gun"
## [79] "Nobody's Fool"
## [80] "Game Night"
## [81] "Capharnaüm"
## [82] "Sicario: Day of the Soldado"
## [83] "Dumplin'"
## [84] "The Meg"
## [85] "Black Mirror: Bandersnatch"
## [86] "Rampage"
## [87] "The Nun"
## [88] "Untogether"
## [89] "To All the Boys I've Loved Before"
## [90] "Then Came You"
## [91] "Jurassic World: Fallen Kingdom"
## [92] "Little Italy"
## [93] "K.G.F: Chapter 1"
## [94] "Eighth Grade"
## [95] "Love, Simon"
## [96] "Peranbu"
## [97] "High Life"
## [98] "Mile 22"
## [99] "Mowgli: Legend of the Jungle"
## [100] "Mandy"
# Using CSS selectors to scrape the description section
# Select the paragraph that directly follows each film's ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n The story of the legendary rock band <a ...
## [2] <p class="text-muted">\n A small group of American soldiers find ...
## [3] <p class="text-muted">\n A working-class Italian-American bounce ...
## [4] <p class="text-muted">\n Arthur Curry, the human-born heir to th ...
## [5] <p class="text-muted">\n In early 18th century England, a frail ...
## [6] <p class="text-muted">\n A musician helps a young singer find fa ...
## [7] <p class="text-muted">\n The Incredibles hero family takes on a ...
## [8] <p class="text-muted">\n In a post-apocalyptic world where citie ...
## [9] <p class="text-muted">\n Set in contemporary Chicago, amid a tim ...
## [10] <p class="text-muted">\n A war-hardened Crusader and his Moorish ...
## [11] <p class="text-muted">\n A year in the life of a middle-class fa ...
## [12] <p class="text-muted">\n The story of <a href="/name/nm0155515"> ...
## [13] <p class="text-muted">\n Teen Miles Morales becomes Spider-Man o ...
## [14] <p class="text-muted">\n The Avengers and their allies must be w ...
## [15] <p class="text-muted">\n Ron Stallworth, an African American pol ...
## [16] <p class="text-muted">\n Five years after an ominous unseen pres ...
## [17] <p class="text-muted">\n When Lee Israel falls out of step with ...
## [18] <p class="text-muted">\n A darkness swirls at the center of a wo ...
## [19] <p class="text-muted">\n As Scott Lang balances being both a Sup ...
## [20] <p class="text-muted">\n French dancers gather in a remote, empt ...
## ...
# Pull the plain-text description out of each node
description_data <- description_data_html %>% html_text()
# Preview the first few descriptions
description_data %>% head()
## [1] "\n The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "\n A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "\n A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "\n Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "\n In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "\n A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# Drop the leading newline and the indentation that follows it
description_data <- description_data %>% str_replace("^\\n\\s+", "")
description_data %>% head()
## [1] "The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# Using CSS selectors to scrape the Movie runtime section
# Select the runtime element of each film and extract its text
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
runtime_data <- runtime_data_html %>% html_text()
# Peek at the raw runtime strings
runtime_data %>% head()
## [1] "134 min" "110 min" "130 min" "143 min" "119 min" "136 min"
# Data-Preprocessing: drop the " min" suffix, then parse as numbers
runtime_data <- runtime_data %>%
  str_replace(" min", "") %>%
  as.numeric()
# Peek at the cleaned runtimes
runtime_data %>% head()
## [1] 134 110 130 143 119 136
# Using CSS selectors to scrape the Movie genre section
# Select the genre element of each film and extract its text
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()
# Peek at the raw genre strings
genre_data %>% head()
## [1] "\nBiography, Drama, Music "
## [2] "\nAction, Adventure, Horror "
## [3] "\nBiography, Comedy, Drama "
## [4] "\nAction, Adventure, Fantasy "
## [5] "\nBiography, Comedy, Drama "
## [6] "\nDrama, Music, Romance "
# Data-Preprocessing: keep only the first listed genre of each film.
# The pattern takes the first run of characters that are neither commas nor
# whitespace, so hyphenated genres such as "Sci-Fi" are kept whole; a
# letters-only pattern would cut them short (to "Sci").
genre_data <- str_extract(genre_data, "[^,\\s]+")
# Converting each genre from text to factor (left disabled)
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Biography" "Action" "Biography" "Action" "Biography" "Drama"
# Using CSS selectors to scrape the IMDB rating section
# Select the IMDB rating element of each film and extract its text
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()
# Peek at the raw rating strings
rating_data %>% head()
## [1] "8.2" "6.9" "8.3" "7.4" "7.8" "7.9"
# Data-Preprocessing: parse the rating strings as numbers
rating_data <- rating_data %>% as.numeric()
# Show all parsed ratings
rating_data
## [1] 8.2 6.9 8.3 7.4 7.8 7.9 7.8 6.2 7.1 5.3 7.9 7.1 8.7 8.5 7.5 6.7 7.3
## [18] 7.0 7.1 7.4 7.2 5.6 7.4 7.4 6.6 6.1 6.8 7.1 7.2 7.3 6.5 5.7 7.0 8.3
## [35] 7.6 6.8 7.5 7.3 6.9 6.7 7.8 6.3 6.3 7.0 7.0 7.0 6.6 6.3 7.2 7.3 6.9
## [52] 7.0 7.7 7.8 7.6 7.3 5.5 7.6 6.7 7.7 7.3 6.2 7.0 7.6 5.4 6.3 5.4 7.6
## [69] 7.1 6.5 6.7 7.6 6.6 7.6 7.0 7.9 7.7 6.8 4.4 7.0 8.3 7.1 6.8 5.7 7.4
## [86] 6.1 5.4 6.1 7.3 7.1 6.2 5.7 8.7 7.5 7.7 9.8 6.7 6.1 6.5 6.6
# Using CSS selectors to scrape the votes section
# Select the vote-count element of each film
votes_data_html <- html_nodes(webpage, ".sort-num_votes-visible span:nth-child(2)")
# Converting the votes data to text
votes_data <- html_text(votes_data_html)
# Let's have a look at the votes data
head(votes_data)
## [1] "243,586" "29,502" "60,299" "163,401" "49,956" "176,004"
# Data-Preprocessing: remove every thousands separator. str_replace() would
# only remove the first comma, leaving e.g. "1,234,567" as "1234,567", which
# as.numeric() turns into NA.
votes_data <- str_replace_all(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
# Let's have another look at the votes data
votes_data
## [1] 243586 29502 60299 163401 49956 176004 174990 37731 41463 23439
## [11] 72768 30339 94345 583827 104314 180411 11262 23543 193491 11735
## [21] 18408 1232 93865 459617 23404 18162 231405 50000 65108 69619
## [31] 9622 8117 203450 11078 273211 117648 277217 1162 67509 4696
## [41] 346551 2933 23087 11258 6066 11174 71171 2168 32809 21285
## [51] 210690 77841 22008 213665 9549 130030 13521 10177 75010 13510
## [61] 45575 126561 19659 34656 82737 36241 9539 6079 12227 4003
## [71] 3449 89560 127515 17419 24663 95891 78890 18486 2231 149002
## [81] 4659 82922 14618 97132 79700 105575 78115 428 53844 690
## [91] 197254 4490 17729 33393 74787 8213 1227 40850 39279 36982
# Using CSS selectors to scrape the directors section
# Select the first link in the paragraph that follows each description
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
directors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm0001741/?ref_=adv_li_dr_0">Bryan Singer</a>
## [2] <a href="/name/nm1170339/?ref_=adv_li_dr_0">Julius Avery</a>
## [3] <a href="/name/nm0268380/?ref_=adv_li_dr_0">Peter Farrelly</a>
## [4] <a href="/name/nm1490123/?ref_=adv_li_dr_0">James Wan</a>
## [5] <a href="/name/nm0487166/?ref_=adv_li_dr_0">Yorgos Lanthimos</a>
## [6] <a href="/name/nm0177896/?ref_=adv_li_dr_0">Bradley Cooper</a>
## [7] <a href="/name/nm0083348/?ref_=adv_li_dr_0">Brad Bird</a>
## [8] <a href="/name/nm0729514/?ref_=adv_li_dr_0">Christian Rivers</a>
## [9] <a href="/name/nm2588606/?ref_=adv_li_dr_0">Steve McQueen</a>
## [10] <a href="/name/nm1163264/?ref_=adv_li_dr_0">Otto Bathurst</a>
## [11] <a href="/name/nm0190859/?ref_=adv_li_dr_0">Alfonso Cuarón</a>
## [12] <a href="/name/nm0570912/?ref_=adv_li_dr_0">Adam McKay</a>
## [13] <a href="/name/nm2130108/?ref_=adv_li_dr_0">Bob Persichetti</a>
## [14] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
## [15] <a href="/name/nm0000490/?ref_=adv_li_dr_0">Spike Lee</a>
## [16] <a href="/name/nm0081540/?ref_=adv_li_dr_0">Susanne Bier</a>
## [17] <a href="/name/nm1716636/?ref_=adv_li_dr_0">Marielle Heller</a>
## [18] <a href="/name/nm0345174/?ref_=adv_li_dr_0">Luca Guadagnino</a>
## [19] <a href="/name/nm0715636/?ref_=adv_li_dr_0">Peyton Reed</a>
## [20] <a href="/name/nm0637615/?ref_=adv_li_dr_0">Gaspar Noé</a>
## ...
# Extract the director name from each link node
directors_data <- directors_data_html %>% html_text()
# Peek at the first few director names
directors_data %>% head()
## [1] "Bryan Singer" "Julius Avery" "Peter Farrelly"
## [4] "James Wan" "Yorgos Lanthimos" "Bradley Cooper"
# Data-Preprocessing: store the director names as a factor and print it
directors_data <- directors_data %>% as.factor()
directors_data
## [1] Bryan Singer Julius Avery Peter Farrelly
## [4] James Wan Yorgos Lanthimos Bradley Cooper
## [7] Brad Bird Christian Rivers Steve McQueen
## [10] Otto Bathurst Alfonso Cuarón Adam McKay
## [13] Bob Persichetti Anthony Russo Spike Lee
## [16] Susanne Bier Marielle Heller Luca Guadagnino
## [19] Peyton Reed Gaspar Noé Clint Eastwood
## [22] Robert D. Krzykowski Damien Chazelle Ryan Coogler
## [25] Donovan Marsh Fede Alvarez Ruben Fleischer
## [28] Travis Knight Drew Goddard Ethan Coen
## [31] Josie Rourke Charles Stone III Ron Howard
## [34] Tatsuya Nagamine John Krasinski David Yates
## [37] Steven Spielberg Joe Penna Paul Feig
## [40] Matthew Heineman David Leitch Jason Reitman
## [43] Yarrow Cheney Joel Edgerton Julian Schnabel
## [46] Asghar Farhadi Jeff Tomsic Christopher Caldwell
## [49] Rob Marshall Felix van Groeningen Alex Garland
## [52] Jon M. Chu Pawel Pawlikowski Christopher McQuarrie
## [55] Barry Jenkins Ari Aster Lasse Hallström
## [58] Sean Anders David Gordon Green Chang-dong Lee
## [61] Phil Johnston Gary Ross Jacques Audiard
## [64] Steven Caple Jr. Shane Black Genndy Tartakovsky
## [67] Jeffrey Nachmanoff Jon S. Baird George Tillman Jr.
## [70] Mimi Leder Karyn Kusama Leigh Whannell
## [73] Francis Lawrence Gustav Möller Lars von Trier
## [76] Wes Anderson Aneesh Chaganty David Lowery
## [79] Tyler Perry John Francis Daley Nadine Labaki
## [82] Stefano Sollima Anne Fletcher Jon Turteltaub
## [85] David Slade Brad Peyton Corin Hardy
## [88] Emma Forrest Susan Johnson Peter Hutchings
## [91] J.A. Bayona Donald Petrie Prashanth Neel
## [94] Bo Burnham Greg Berlanti Ram
## [97] Claire Denis Peter Berg Andy Serkis
## [100] Panos Cosmatos
## 100 Levels: Adam McKay Alex Garland Alfonso Cuarón ... Yorgos Lanthimos
# Using CSS selectors to scrape the actors section
# Select the link that directly follows the ".ghost" separator in each item
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
actors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm1785339/?ref_=adv_li_st_0">Rami Malek</a>
## [2] <a href="/name/nm5381254/?ref_=adv_li_st_0">Jovan Adepo</a>
## [3] <a href="/name/nm0001557/?ref_=adv_li_st_0">Viggo Mortensen</a>
## [4] <a href="/name/nm0597388/?ref_=adv_li_st_0">Jason Momoa</a>
## [5] <a href="/name/nm1469236/?ref_=adv_li_st_0">Olivia Colman</a>
## [6] <a href="/name/nm3078932/?ref_=adv_li_st_0">Lady Gaga</a>
## [7] <a href="/name/nm0005266/?ref_=adv_li_st_0">Craig T. Nelson</a>
## [8] <a href="/name/nm2623492/?ref_=adv_li_st_0">Hera Hilmar</a>
## [9] <a href="/name/nm0205626/?ref_=adv_li_st_0">Viola Davis</a>
## [10] <a href="/name/nm5473782/?ref_=adv_li_st_0">Taron Egerton</a>
## [11] <a href="/name/nm8611957/?ref_=adv_li_st_0">Yalitza Aparicio</a>
## [12] <a href="/name/nm0000288/?ref_=adv_li_st_0">Christian Bale</a>
## [13] <a href="/name/nm4271336/?ref_=adv_li_st_0">Shameik Moore</a>
## [14] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
## [15] <a href="/name/nm0913475/?ref_=adv_li_st_0">John David Washington</a>
## [16] <a href="/name/nm0000113/?ref_=adv_li_st_0">Sandra Bullock</a>
## [17] <a href="/name/nm0565250/?ref_=adv_li_st_0">Melissa McCarthy</a>
## [18] <a href="/name/nm1631269/?ref_=adv_li_st_0">Chloë Grace Moretz</a>
## [19] <a href="/name/nm0748620/?ref_=adv_li_st_0">Paul Rudd</a>
## [20] <a href="/name/nm1154749/?ref_=adv_li_st_0">Sofia Boutella</a>
## ...
# Extract the actor name from each link node
actors_data <- actors_data_html %>% html_text()
# Peek at the first few actor names
actors_data %>% head()
## [1] "Rami Malek" "Jovan Adepo" "Viggo Mortensen" "Jason Momoa"
## [5] "Olivia Colman" "Lady Gaga"
# Data-Preprocessing: store the actor names as a factor and print it
actors_data <- actors_data %>% as.factor()
actors_data
## [1] Rami Malek Jovan Adepo Viggo Mortensen
## [4] Jason Momoa Olivia Colman Lady Gaga
## [7] Craig T. Nelson Hera Hilmar Viola Davis
## [10] Taron Egerton Yalitza Aparicio Christian Bale
## [13] Shameik Moore Robert Downey Jr. John David Washington
## [16] Sandra Bullock Melissa McCarthy Chloë Grace Moretz
## [19] Paul Rudd Sofia Boutella Bradley Cooper
## [22] Sam Elliott Ryan Gosling Chadwick Boseman
## [25] Gerard Butler Claire Foy Tom Hardy
## [28] Hailee Steinfeld Jeff Bridges Tim Blake Nelson
## [31] Saoirse Ronan Kyrie Irving Alden Ehrenreich
## [34] Masako Nozawa Emily Blunt Eddie Redmayne
## [37] Tye Sheridan Mads Mikkelsen Anna Kendrick
## [40] Rosamund Pike Ryan Reynolds Hugh Jackman
## [43] Benedict Cumberbatch Lucas Hedges Willem Dafoe
## [46] Penélope Cruz Jeremy Renner Sophie Thatcher
## [49] Emily Blunt Steve Carell Natalie Portman
## [52] Constance Wu Joanna Kulig Tom Cruise
## [55] KiKi Layne Toni Collette Mackenzie Foy
## [58] Mark Wahlberg Jamie Lee Curtis Ah-in Yoo
## [61] John C. Reilly Sandra Bullock John C. Reilly
## [64] Michael B. Jordan Boyd Holbrook Adam Sandler
## [67] Alice Eve Shirley Henderson Amandla Stenberg
## [70] Felicity Jones Nicole Kidman Logan Marshall-Green
## [73] Jennifer Lawrence Jakob Cedergren Matt Dillon
## [76] Bryan Cranston John Cho Robert Redford
## [79] Tiffany Haddish Jason Bateman Zain Al Rafeea
## [82] Benicio Del Toro Danielle Macdonald Jason Statham
## [85] Fionn Whitehead Dwayne Johnson Demián Bichir
## [88] Alice Eve Lana Condor Asa Butterfield
## [91] Chris Pratt Hayden Christensen Yash
## [94] Elsie Fisher Nick Robinson Mammootty
## [97] Robert Pattinson Mark Wahlberg Christian Bale
## [100] Nicolas Cage
## 94 Levels: Adam Sandler Ah-in Yoo Alden Ehrenreich ... Zain Al Rafeea
# Using CSS selectors to scrape the metascore section
# Select every metascore element on the page and extract its text
metascore_data_html <- webpage %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>% html_text()
# Peek at the raw metascore strings
metascore_data %>% head()
## [1] "49 " "60 " "69 " "55 " "90 "
## [6] "88 "
# Data-Preprocessing: strip trailing whitespace, then parse as numbers
metascore_data <- metascore_data %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
metascore_data
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 70 78 66
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 81
## [70] 60 62 67 53 83 42 82 71 80 39 66 75 61 53 46 45 46 45 64 43 51 28 90
## [93] 72 81 38 51 81
# Let's check the length of the metascore data
length(metascore_data)
## [1] 97
# Fewer than 100 values: some films have no metascore. Instead of
# hard-coding the positions of the missing films (found by eye, and silently
# wrong as soon as the live page changes), look the metascore up once per
# film: html_node() returns a missing node where an item has no match, and
# html_text() turns that into NA, so the result stays aligned with the
# other columns.
# NOTE(review): assumes each film is exactly one ".lister-item-content"
# node (the actors selector relies on the same container) -- the check
# below fails loudly if that does not hold.
ms <- webpage %>%
  html_nodes(".lister-item-content") %>%
  html_node(".metascore") %>%
  html_text() %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
stopifnot(length(ms) == 100L)
(metascore_data <- ms)
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 70 78 66
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 NA
## [70] 81 60 62 67 NA 53 83 42 82 71 80 39 66 75 61 53 46 NA 45 46 45 64 43
## [93] 51 28 90 72 81 38 51 81
# Using CSS selectors to scrape the gross revenue section
# Select every gross-revenue element on the page and extract its text
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>% html_text()
# Peek at the raw gross revenue strings
gross_data %>% head()
## [1] "$210.57M" "$21.70M" "$61.38M" "$328.45M" "$30.21M" "$208.73M"
# Data-Preprocessing: strip the "$" and "M" signs and parse as numbers.
# The signs are removed by pattern rather than by fixed character positions,
# which would cut off digits (e.g. "$210.57M" would end up as 210.5).
gross_data <- as.numeric(str_replace_all(gross_data, "[$M]", ""))
# Let's check the length of gross data
length(gross_data)
## [1] 85
# Fewer than 100 values: some films have no gross revenue listed. Instead
# of hard-coding the positions of the missing films (found by eye, and
# silently shifting values onto the wrong films once the live page changes),
# look the gross up once per film: html_node() returns a missing node where
# an item has no match, and html_text() turns that into NA, so the result
# stays aligned with the other columns.
# NOTE(review): assumes each film is exactly one ".lister-item-content"
# node (the actors selector relies on the same container) -- the check
# below fails loudly if that does not hold.
gs_data <- webpage %>%
  html_nodes(".lister-item-content") %>%
  html_node(".ghost~ .text-muted+ span") %>%
  html_text() %>%
  str_replace_all("[$M]", "") %>%
  as.numeric()
stopifnot(length(gs_data) == 100L)
(gross_data <- gs_data)
## [1] 210.50 21.70 61.38 328.40 30.21 NA 208.70 608.50 15.95 42.39
## [11] 30.82 NA 45.22 179.80 678.80 48.69 8.54 2.47 216.60 102.60
## [21] 44.94 700.00 15.77 14.84 213.50 125.70 17.84 16.47 NA 42.47
## [31] 213.70 30.38 188.00 159.40 137.60 0.16 53.54 1.63 324.50 NA
## [41] 2.00 270.60 6.79 2.29 0.07 54.55 169.70 7.65 32.73 174.50
## [51] 2.90 220.10 13.77 44.07 54.86 67.36 159.30 0.70 197.50 139.30
## [61] NA 3.14 115.60 51.02 167.50 4.04 4.30 29.72 NA 23.80
## [71] NA 1.46 11.87 NA 46.87 0.21 0.09 NA 32.02 26.02
## [81] 11.28 NA 31.71 NA NA NA NA 69.00 0.74 NA
## [91] 50.07 145.40 99.35 117.40 417.70 0.99 13.54 40.83 36.11 1.21
Form a tibble:
# Assemble the scraped vectors into one tibble, one row per film
movies <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Print with unlimited width so that every column is shown
print(movies, width = Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 Bohemian Rhapsody
## 2 2 Overlord
## 3 3 Green Book
## 4 4 Aquaman
## 5 5 The Favourite
## 6 6 A Star Is Born
## 7 7 Incredibles 2
## 8 8 Mortal Engines
## 9 9 Widows
## 10 10 Robin Hood
## Description
## <chr>
## 1 The story of the legendary rock band Queen and lead singer Freddie Merc…
## 2 A small group of American soldiers find horror behind enemy lines on th…
## 3 A working-class Italian-American bouncer becomes the driver of an Afric…
## 4 Arthur Curry, the human-born heir to the underwater kingdom of Atlantis…
## 5 In early 18th century England, a frail Queen Anne occupies the throne a…
## 6 A musician helps a young singer find fame, even as age and alcoholism s…
## 7 The Incredibles hero family takes on a new mission, which involves a ch…
## 8 In a post-apocalyptic world where cities ride on wheels and consume eac…
## 9 Set in contemporary Chicago, amid a time of turmoil, four women with no…
## 10 A war-hardened Crusader and his Moorish commander mount an audacious re…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 134 Biography 8.2 49 243586 210.
## 2 110 Action 6.9 60 29502 21.7
## 3 130 Biography 8.3 69 60299 61.4
## 4 143 Action 7.4 55 163401 328.
## 5 119 Biography 7.8 90 49956 30.2
## 6 136 Drama 7.9 88 176004 NA
## 7 118 Animation 7.8 80 174990 209.
## 8 128 Action 6.2 44 37731 608.
## 9 129 Crime 7.1 84 41463 16.0
## 10 116 Action 5.3 32 23439 42.4
## Director Actor
## <fct> <fct>
## 1 Bryan Singer Rami Malek
## 2 Julius Avery Jovan Adepo
## 3 Peter Farrelly Viggo Mortensen
## 4 James Wan Jason Momoa
## 5 Yorgos Lanthimos Olivia Colman
## 6 Bradley Cooper Lady Gaga
## 7 Brad Bird Craig T. Nelson
## 8 Christian Rivers Hera Hilmar
## 9 Steve McQueen Viola Davis
## 10 Otto Bathurst Taron Egerton
## # … with 90 more rows
How many top 100 movies are in each genre?
# Bar chart of the number of films per genre
movies %>%
  ggplot(aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Mean gross earning per genre, ignoring films without a gross value
earn_by_genre <- movies %>%
  group_by(Genre) %>%
  summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE))
earn_by_genre
## # A tibble: 10 x 2
## Genre avg_earning
## <chr> <dbl>
## 1 Action 103.
## 2 Adventure 147.
## 3 Animation 70.7
## 4 Biography 132.
## 5 Comedy 130.
## 6 Crime 43.4
## 7 Drama 45.5
## 8 Fantasy 2.47
## 9 Horror 198.
## 10 Sci 4.3
# Column chart of the mean gross earning per genre
earn_by_genre %>%
  ggplot(aes(x = Genre, y = avg_earning)) +
  geom_col() +
  labs(y = "avg earning in millions")
# Box plot of the gross earning distribution within each genre
movies %>%
  ggplot(aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  labs(y = "Gross earning in millions")
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? First, find the best-selling movie in each genre:
library("ggrepel")
# Within each genre, keep the film ranked first by descending gross earning
best_in_genre <- movies %>%
  group_by(Genre) %>%
  filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
best_in_genre
## # A tibble: 10 x 11
## # Groups: Genre [10]
## Rank Title Description Runtime Genre Rating Metascore Votes
## <int> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 7 Incr… The Incred… 118 Anim… 7.8 80 174990
## 2 8 Mort… In a post-… 128 Acti… 6.2 44 37731
## 3 15 Blac… Ron Stallw… 135 Biog… 7.5 83 104314
## 4 18 Susp… A darkness… 152 Fant… 7 64 23543
## 5 22 The … A legendar… 98 Adve… 5.6 50 1232
## 6 55 If B… A woman in… 119 Crime 7.6 87 9549
## 7 59 Hall… Laurie Str… 106 Horr… 6.7 67 75010
## 8 60 Burn… Jong-su bu… 148 Drama 7.7 90 13510
## 9 67 Repl… A scientis… 107 Sci 5.4 19 9539
## 10 95 Love… Simon Spie… 110 Come… 7.7 90 74787
## # … with 3 more variables: Gross_Earning_in_Mil <dbl>, Director <fct>,
## # Actor <fct>
# Scatter plot of gross earning against rating: point size shows the vote
# count, colour shows the genre, and the films in best_in_genre are labelled
movies %>%
  ggplot(aes(x = Rating, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(data = best_in_genre, mapping = aes(label = Title)) +
  labs(y = "Gross earning in millions")
## Warning: Removed 15 rows containing missing values (geom_point).
Read the blog post at https://www.r-bloggers.com/how-to-scrape-images-from-google/
For example, to scrape and download images for a search term (the blog post uses “Adidas logo”; the code below uses “ucla”):
source("scrapeGoogleImages.R")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Change the search term here. scrapeJSSite() is not a package function;
# presumably it is defined by the sourced scrapeGoogleImages.R -- confirm.
gg <- scrapeJSSite(searchTerm = "ucla")
gg
## images
## 1 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPAHGngfC90w8TVGwxDtF7EOfHRruNfeHqVicOCZHfcFZ28aN9AGMPOrE7
## 2 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQjKYK2GAUPobVyiIe5pKGd9EGrqLaWIdw3a7XskhrW2ezEcFrcWn94-5Pq
## 3 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQzx04u2Ub5IC4UqGukXpcq2Gsozo52nJeQOaI7b8JzgHmTdCkGPgKl3OGX
## 4 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS1OecHJM-b8-NAvWfsoi8xuOtoHfibWWGCkQ21ZbQM_BELXdODyq5ZAQ
## 5 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRi4lynSHbXZ4Iw8g2dqSWIHUbwYlVAnCG8JmoJk0m5TDqv7u1A4DZXIXo
## 6 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ84ERmbLNfXOn48-fXI3kVLCqjBv-mRS1c5y2yinRX1MSy2PuD6ZVIcj7r
## 7 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQtayay8IIEhDm5NyjKj_1Jir0j1YKdHHcUoUJaI0MYV5t7WcK7iK0e_bfN
## 8 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS7PJWOnMu2-F2ZOcKYLiHF6EJM4ddYQas6W7_nEOqv4kmyo5ANyJK-wy1g
## 9 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSi7wafHAE8gTQM7LWdMDi49sfwV9bp4n2-l6MdJ3pODMLa-z83zQZ4j2FY
## 10 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRMEqWVOjAacm0LJTgyltC9U1l9RaZLguWRRBGFfepHhGaRoMaEs2v5iQ
## 11 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQgDqKCP2Uj7dmIqDKLCzwnO2Nxe4NKkxqi7yzULvq5yIvx9AM5Let8VSfF
## 12 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQV1joSR-AfZLeCI-glojukNQZjylMEGb7C5vHTc1ZOq-PbJGYreRQXeVY
## 13 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTI-oSi_FSFDeqOxkX1sse5D2Q-yfDKE28MMq3lOh1B9LckMTTYi-JxJIC8
## 14 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT6SWyqwFbDz5WNWBSbZXABGGTNxP122uQeg1skNxBpv5IBeMCGX4NiSK4
## 15 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcuXKyr-IO6FRGq4mKUqfaZMmVUKdqOFGkC74VFxLv3atIlywjhMvLay5Z
## 16 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRT1KLpkaIjTkm0O1zSHvILuNkk8lZfiUZoJqwD6AH8k1OhuZcedK1xUE
## 17 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT6Vr40BZfh0xMbWCgMRLlHZgPIuZtrEsGMM4FyGu3kEcGLrauySTKCjZ2_
## 18 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQE3k_c0rFd-J7mOLS19TZ4crq6Bx9vXI3gNGPklgoVArW3skuai1ljggub
## 19 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQsP-A6JtaJjfe33HsF5Ng7yRa4HRalvfCBNjERoFxWhvFL_ALJ3UcYtCno
## 20 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQn0CjOdq67t96Yp2RkG-vWwRn0-bCQN6E2zv23zlKs1iy-Jvu0IoPo-Q
## search
## 1 ucla
## 2 ucla
## 3 ucla
## 4 ucla
## 5 ucla
## 6 ucla
## 7 ucla
## 8 ucla
## 9 ucla
## 10 ucla
## 11 ucla
## 12 ucla
## 13 ucla
## 14 ucla
## 15 ucla
## 16 ucla
## 17 ucla
## 18 ucla
## 19 ucla
## 20 ucla
# Pass the scraped image URLs on as a character vector.
# NOTE(review): the meaning of the second argument (1) is not visible here;
# presumably downloadImages() comes from scrapeGoogleImages.R -- confirm.
downloadImages(as.character(gg[["images"]]), 1)
The quantmod package contains many utility functions for retrieving and plotting financial data. For example:
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
# Fetch the AAPL series from Yahoo; auto.assign = FALSE is passed so the
# result of the call is stored in `stock`
stock <- getSymbols(
  "AAPL",
  src = "yahoo",
  auto.assign = FALSE
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
# Line chart of the series on a white theme, with a linear (non-log) scale
# and TA set to NULL
chartSeries(
  stock,
  type = "line",
  theme = chartTheme("white"),
  log.scale = FALSE,
  TA = NULL
)